Goal 1: time and memory for one full run

# Load every per-run benchmark TSV, derive run id/type from the filename,
# convert peak virtual memory to GB, and attach artifact metadata from `qzas`.
bench1 <- fs::dir_ls(path = "one-run/", glob = "*tsv") |>
  vroom(id = "path") |>
  # Strip the constant filename prefix/suffix so only "<id>_<type>" remains.
  # fixed() treats the patterns literally (the suffix contains regex dots).
  mutate(path = str_remove(path, fixed("one-run/train_ver9_"))) |>
  mutate(path = str_remove(path, fixed("_25.07.2023-Q2-2023.5.tsv"))) |>
  # remove = FALSE keeps `path` as the join key for left_join() below.
  separate(path, c("id", "type"), sep = "_", extra = "merge", remove = FALSE) |>
  # max_vms appears to be reported in MB; /1024 gives GB — TODO confirm units.
  mutate(mem_GBs = max_vms / 1024) |>
  left_join(qzas)
## Rows: 8 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl  (9): s, max_rss, max_vms, max_uss, max_pss, io_in, io_out, mean_load, c...
## time (1): h:m:s
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1, 5].
## Joining with `by = join_by(path)`
# Shared discrete fill palette applied to every bar chart below.
plotcolors <- scale_fill_brewer(palette = "Set2")

# Runtime per pipeline step, converted from seconds to hours.
ggplot(bench1, aes(x = type, y = s / 60 / 60, fill = id)) +
  geom_col(position = "dodge") +
  labs(title = "Runtime in hours") +
  plotcolors

# Peak resident set size per step, converted from MB to GB.
ggplot(bench1, aes(x = type, y = max_rss / 1000, fill = id)) +
  geom_col(position = "dodge") +
  labs(title = "Maximum memory usage in GBs") +
  plotcolors

Goal 1.A What are all these memories?

Full summary on StackOverflow.

# Compare all four memory metrics (max_rss/vms/uss/pss) side by side per run,
# converted from MB to GB.
ggplot(
  pivot_longer(bench1, starts_with("max"), values_to = "MB"),
  aes(x = paste(id, type), y = MB / 1024, fill = name)
) +
  geom_col(position = "dodge")

Goal 2: time and memory ~ reads-per-chunk

Parameter sweep over reads-per-chunk settings for the skl classifiers.

What read numbers should we test, along a log scale?

# Candidate reads-per-chunk values: 11 points evenly spaced on a log10 scale
# between 10^3 and 10^4.
data.frame(x = 3 + (0:10) / 10) |>
  transform(y = round(10^x))
##      x     y
## 1  3.0  1000
## 2  3.1  1259
## 3  3.2  1585
## 4  3.3  1995
## 5  3.4  2512
## 6  3.5  3162
## 7  3.6  3981
## 8  3.7  5012
## 9  3.8  6310
## 10 3.9  7943
## 11 4.0 10000
# Load the reads-per-chunk sweep TSVs; the chunk size is encoded in the
# filename, so strip the fixed prefix/suffix and parse the remainder.
bench2 <- fs::dir_ls(path = "batch-size/", glob = "*tsv") |>
  vroom(id = "path") |>
  mutate(
    path = str_remove(path, fixed("batch-size/train_ver9_99_25.07.2023-Q2-2023.5-chunk")),
    # fixed() keeps the dot literal (".tsv" as a regex matches any char + "tsv").
    path = str_remove(path, fixed(".tsv")),
    # Numeric chunk size for continuous axes...
    path = as.numeric(path),
    # ...and a factor copy for discrete axes.  (`path` is already numeric here;
    # the previous version re-coerced it redundantly.)
    type = as.factor(path)
  )
## Rows: 16 Columns: 11
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl  (9): s, max_rss, max_vms, max_uss, max_pss, io_in, io_out, mean_load, c...
## time (1): h:m:s
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Re-declare the shared fill palette for this section's plots (same as above).
plotcolors <- scale_fill_brewer(palette = "Set2")

# Runtime in hours against chunk size, on a log10 x-axis.
ggplot(bench2, aes(x = path, y = s / 60 / 60)) +
  geom_col(position = "dodge") +
  scale_x_log10() +
  labs(title = "Runtime in hours") +
  plotcolors

# Peak resident memory (GB) per chunk size, using the factor axis so every
# tested value gets its own bar.
ggplot(bench2, aes(x = type, y = max_rss / 1000)) +
  geom_col(position = "dodge") +
  labs(title = "Maximum memory usage in GBs") +
  plotcolors

Goal 3: What scales with memory?

# Does runtime predict peak memory?  Scatter with an OLS fit line, the fitted
# equation, and the correlation coefficient annotated at fixed y positions.
ggscatter(bench1, x = "s", y = "mem_GBs", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.y = 1.2e2) +
  stat_cor(label.y = 1.1e2)

# Does bytes read (io_in) predict peak memory?
bench1 |>
  ggscatter(x = "io_in", y = "mem_GBs", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 7777, label.y = 1.2e2) +
  # Fixed: a stray extra comma passed an empty positional argument.
  stat_cor(label.x = 7777, label.y = 1.1e2)

# Does input artifact size (from the qzas join) predict peak virtual memory?
bench1 |>
  ggscatter(x = "size", y = "max_vms", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 10, label.y = 1.2e5) +
  # Fixed: a stray extra comma passed an empty positional argument.
  stat_cor(label.x = 10, label.y = 1.1e5)

# Does bytes written (io_out, converted to GB) predict peak memory?
bench1 |>
  mutate(GBsOut = io_out / 1024) |>
  ggscatter(x = "GBsOut", y = "mem_GBs", add = "reg.line") +
  # scale_x_log10() + scale_y_log10() +
  stat_regline_equation(label.x = 10, label.y = 1.2e2) +
  # Fixed: a stray extra comma passed an empty positional argument.
  stat_cor(label.x = 10, label.y = 1.1e2)

Ladies and gentlemen, we got 'em.

# Interactive version: output GB vs. memory GB on log-log axes, colored by
# step type, with one line per type.
ggplotly(
  bench1 |>
    mutate(GBsOut = io_out / 1024) |>
    ggplot(aes(x = GBsOut, y = mem_GBs, color = type, shape = id)) +
    geom_point(size = 3) +
    # `size` for lines was deprecated in ggplot2 3.4.0; `linewidth` is the
    # supported aesthetic and silences the deprecation warning.
    geom_line(alpha = 0.5, linewidth = 2, aes(group = type)) +
    scale_x_log10() + scale_y_log10()
)